library(glmnet)
library(dplyr)
library(tidyverse)
library(edgeR)
library(limma)
library(AnnotationHub)
library(ensembldb)
library(biomaRt)
library(dbplyr) #must load this package to make sure AnnotationHub can grep the database
library(DT)
library(survival)
library(ggpubr)
library(survminer)
library(rstatix)
library(ezcox)

Establishing Lasso Cross-validation model to get CIN signature genes from TCGA PRAD transcriptomic data

back to home

Importing CN-signature 3 TCGA PRAD samples info

sig3_patient<-readRDS(file = "/xdisk/mpadi/jiawenyang/result/centrosome_loss/sigminer/clinical_association_table/sequenza_tcga_cn_sig3_tcga_samples.rds")

Processing TCGA PRAD data downloaded from GDC data portal

TCGA_meta_data<-read_tsv(file = "/xdisk/mpadi/jiawenyang/data/centrosome_loss/TCGA/gdc_sample_sheet.2024-02-06.tsv", col_names = T)
TCGA_rna_read_counts_files<-list.files(path = "/xdisk/mpadi/jiawenyang/data/centrosome_loss/TCGA/read_count",
                                       pattern = "*counts.tsv",
                                       recursive = T,
                                       full.names = T)
meta_data<-as.data.frame(TCGA_meta_data)[, c("File Name", "Sample ID", "Case ID", "Sample Type")]
tumor_meta_data<-meta_data[meta_data$`Sample Type` !="Solid Tissue Normal", ]

read_count<-as.data.frame(read.table(TCGA_rna_read_counts_files[1], sep = "\t", header = T))
read_count<-read_count[5:nrow(read_count), c("gene_id", "gene_name", "unstranded")]
colnames(read_count)<-c("gene_id", "gene_name", basename(TCGA_rna_read_counts_files[1]))

for (i in 2:length(TCGA_rna_read_counts_files)){
  file_name<-basename(TCGA_rna_read_counts_files[i])
  df<-read.table(TCGA_rna_read_counts_files[i], sep = "\t", header = T)
  df<-df[5:nrow(df), c("gene_id", "gene_name", "unstranded")]
  colnames(df)<-c("gene_id", "gene_name", file_name)
  read_count<-left_join(read_count, df, by = c("gene_id", "gene_name"))
}

read_count<-readRDS("/xdisk/mpadi/jiawenyang/data/centrosome_loss/TCGA/read_count_unstranded_TCGA.rds")
samples<-colnames(read_count)[3:length(colnames(read_count))]
tumor_samples<-samples[samples %in% tumor_meta_data$`File Name`]
tumor_matrix<-read_count[, c("gene_id","gene_name",tumor_meta_data$`File Name`)]
colnames(tumor_matrix)<-c("gene_id", "gene_name", tumor_meta_data$`Sample ID`)
dim(tumor_matrix)

# Normalize gene expression matrix.
group<-factor(colnames(tumor_matrix)[3:length(colnames(tumor_matrix))])
counts<-tumor_matrix[,3:ncol(tumor_matrix)]
rownames(counts)<-tumor_matrix$gene_id
  
y <-DGEList(counts = counts, group = group)
keep <- rowSums(cpm(y) > 1) >= length(group)/2
y <- y[keep, keep.lib.sizes = FALSE]
y <- calcNormFactors(y, method = "TMM")
design <- model.matrix(~0 + group)
colnames(design)<-gsub("group", "", colnames(design))
temp <- voom(y, design, plot = T)
edat <- temp$E
edat <- as.data.frame(edat)

Establishing model

sig3_patient<-substr(sig3_patient, 1, nchar(sig3_patient)-3)
edat<-readRDS(file = "/xdisk/mpadi/jiawenyang/data/centrosome_loss/TCGA/tcga_prad_tumor_edat.rds")
colnames(edat)<-unlist(lapply(colnames(edat), function(x) substr(x, 1, 12)))
library(glmnet)
labels<-ifelse(colnames(edat) %in% sig3_patient, 1, 0)

#prepare data
x = as.matrix(t(edat))
y = labels

#perform lasso regression with cross-validation
set.seed(1234)
lasso_model<-cv.glmnet(x, y, alpha = 1, nfolds = 10) #ALPHA = 1 for lasso

#Extract selected features (genes) based on the optimal lambda
selected_genes<-coef(lasso_model, s = "lambda.min")

non_zero_genes <- rownames(selected_genes)[which(selected_genes != 0)]

ensembl<-useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")
ensemblId<-non_zero_genes[-1] #exclude intercept
ensemblId<-unlist(lapply(ensemblId, function(x) strsplit(x, "[.]")[[1]][1]))

library(dbplyr)
signature_gene_annotation<-biomaRt::getBM(filters= "ensembl_gene_id", attributes= c("ensembl_gene_id","hgnc_symbol","chromosome_name","band","description","start_position","end_position", "entrezgene_id"),
                        values=ensemblId, mart=ensembl)
signature_gene_annotation<-signature_gene_annotation[signature_gene_annotation$hgnc_symbol != "",]
DT::datatable(signature_gene_annotation)

Importing CIN signature genes and curating gene names to follow gene symbol

CIN_genes<-readxl::read_excel("/xdisk/mpadi/jiawenyang/result/centrosome_loss/lasso_signature_genes/CIN_signature_genes.xlsx")
#CIN70
CIN70<-CIN_genes$CIN70 #Orginal list from published paper: https://www.nature.com/articles/ng1861. Need to adjust gene names to their formal human gene symbol.
CIN70[61]<-"GPI" 
CIN70[4]<-"CDCA2" #Cell division cycle-associated protein, was CDC2 in the list.
CIN70[13]<-"CEP250" #Centrosome Nek2-associated protein 1, was CNAP1
CIN70[18]<-"CDC45" #Cell division cycle 45 -like was CDC45L
CIN70[46]<-"NCAPH" #Non-SMC condensin I complex subunit H. was BRRN1
CIN70[49]<-"NCAPG2" #Non-SMC condensin II complex subunit G2. was MTB 
CIN70[52]<-"PBK" #PDZ binding kinase gene was TOPK
CIN70[62]<-"SRSF2" #Serine And arginine rich splicing factor 2.was SFRS2
CIN70[67]<-"AURKA" #Aurora kinase A.was STK6
CIN70[69]<-"TMEM194A" #Nuclear envelope integral membrane protein 1.was KIAA0286

#CIN25
CIN25<-na.omit(CIN_genes$CIN25)#Orginal list from published paper: https://www.nature.com/articles/ng1861.
CIN25[3]<-"CDC45" #Cell division cycle 45 -like was CDC45L
CIN25[6]<-"CDK1" #Cyclin dependent kinease 1, was CDC2.
CIN25[14]<-"CEP250" #Centrosome Nek2-associated protein 1, was CNAP1

#CA20
CA20<-na.omit(CIN_genes$CA20) #gene names are all formal gene symbol.

#CIN23
CIN23<-na.omit(CIN_genes$CIN23) #gene names are all formal gene symbol.#GTF2IP7 and CD24 are not included in the dataset

#CIN7
CIN7<-na.omit(CIN_genes$CIN7) #gene names are all formal gene symbol.

#CN-sig3
CN_sig3<-na.omit(CIN_genes$`CN-sig3 genes_with_All_TCGA_prad_genes`) #gene names are all formal gene symbol.

Back to top

Performing clinical outcomes association analysese in independent PCa cohorts with samples classified using CIN gene sets

back to home

Performing analyses in DKFZ 2018 Cancer Cell dataset

Performing kaplan-meier survival analysis in DKFZ 2018 Cancer Cell data

dkfz_df<-read.table("/xdisk/mpadi/jiawenyang/data/centrosome_loss/DKFZ_2018/data_mrna_seq_rpkm_zscores_ref_all_samples.txt",sep = "\t", header = T)
zdat<-dkfz_df[,c(1,3:ncol(dkfz_df))]
sample_metadata<-read.table("/xdisk/mpadi/jiawenyang/data/centrosome_loss/DKFZ_2018/data_clinical_sample.txt", sep = "\t", header = T)
patient_metadata<-read.table("/xdisk/mpadi/jiawenyang/data/centrosome_loss/DKFZ_2018/data_clinical_patient.txt", sep = "\t", header = T)


CIN.gene.set<-c("CIN70", "CIN25", "CIN23", "CIN7", "CA20", "CN_sig3")
DKFZ_CIN_zscore_matrix<-list()
DKFZ_CIN_clincal_matrix<-list()
for (i in 1:length(CIN.gene.set)){
  CIN.name<-CIN.gene.set[i]
  cn_signature_genes <- get(CIN.gene.set[i])
  subzdat<-zdat[zdat$Hugo_Symbol %in% cn_signature_genes,]
  clinical_metadata<-left_join(sample_metadata, patient_metadata, by = "PATIENT_ID")
  duplicated_samples<-clinical_metadata[duplicated(clinical_metadata$PATIENT_ID), "SAMPLE_ID"]
  subzdat<-subzdat[, !colnames(subzdat) %in% intersect(duplicated_samples, colnames(subzdat))]
  altered_group<-c()
  for (k in 1:nrow(subzdat)){
    sample_id<-colnames(subzdat[,2:ncol(subzdat)])[which(abs(subzdat[k, 2:ncol(subzdat)]) >= 2)] #absolute z-score greater than 2 is defined as altered.
    altered_group<-c(altered_group, sample_id)
  }
  altered_group<-unique(altered_group)
  unaltered_group<-unique(colnames(subzdat)[-1][!colnames(subzdat)[-1] %in% altered_group])
  subzdat_ordered<-subzdat[, c(altered_group, unaltered_group)]
  rownames(subzdat_ordered)<-subzdat$Hugo_Symbol
  DKFZ_CIN_zscore_matrix[[CIN.name]]<-subzdat_ordered

  mrnaseq_clinical_metadata<-clinical_metadata[clinical_metadata$SAMPLE_ID %in% colnames(subzdat)[2:ncol(subzdat)],]
  alt_sample_metadata<-mrnaseq_clinical_metadata[mrnaseq_clinical_metadata$SAMPLE_ID %in% unique(altered_group), ]
  unalt_sample_metadata<-mrnaseq_clinical_metadata[!mrnaseq_clinical_metadata$SAMPLE_ID %in% unique(altered_group),]
  alt_sample_metadata[, "group"]<-rep("altered", nrow(alt_sample_metadata))
  unalt_sample_metadata[,"group"]<-rep("unaltered", nrow(unalt_sample_metadata))
  clinical_data<-rbind(alt_sample_metadata, unalt_sample_metadata)
  DKFZ_CIN_clincal_matrix[[CIN.name]]<-clinical_data
  clinical_data_KM<-clinical_data
  BCR_curve<-Surv(clinical_data_KM$TIME_FROM_SURGERY_TO_BCR_LASTFU, clinical_data_KM$BCR_STATUS)
  BCR_Sfit<-survfit(Surv(TIME_FROM_SURGERY_TO_BCR_LASTFU, BCR_STATUS)~group, data = clinical_data_KM)

  dkfz_prad_relapse<-ggsurvplot(BCR_Sfit, 
                              conf.int = F, 
                              pval = T, 
                              risk.table = T, 
                              legend.labs = c("altered", "unaltered"),
                              legend.title = "CIN signature genes",
                              xlab = "Time (month)",
                              title = paste0("DKFZ Prostate cancer patients BCR-free survival \n ", CIN.name, " alteration"),
                              palette = c("orange", "#5BC0EB"),
                              font.tickslab = c(15),
                              risk.table.height = .25)
  print(dkfz_prad_relapse)
}

Performing association analyses in DKFZ 2018 Cancer Cell data with CN-signature 3 associated genes

#GLEASON score
library(RColorBrewer)

col_values = rev(brewer.pal(n = 7, name = "RdBu"))
col_values[4] <- "#e7e689"

clinical_data_gleason<-clinical_data %>% group_by(group) %>% dplyr::count(GLEASON_SCORE)
GLEASON_df<-data.frame("altered" = c(1, 8, 5, 0, 4, 5, 0), "unaltered" = c(12, 48, 6, 1, 2, 1, 1),
                       row.names = c("3+3", "3+4", "4+3", "4+4", "4+5", "5+4", "5+5"))

p_gleason <- ggplot(data=clinical_data_gleason, 
            aes(x=factor(group, levels = c("altered", "unaltered")), 
                y=n, 
                fill=GLEASON_SCORE)) +
    scale_fill_manual(labels = c("3+3", "3+4", "4+3", "4+4", "4+5", "5+4", "5+5"), values=col_values) +
    geom_bar(position = "fill", stat="identity", width = 0.6) +
    geom_text(aes(label = n), size = 3, hjust = 0.5, vjust = 3, position = "fill") +
    xlab("sample groups") +                 
    ylab("Percentage of each sample type") +   
    ggtitle("Associate CIN signatures genes expression with patients PCa grades") +
    theme_bw () +
    theme(title = element_text(face = "bold"),
          legend.title = element_text(face = "bold"),
          legend.text = element_text(face = "bold"),
          axis.text.x = element_text(face = "bold"),
          axis.text.y = element_text(face = "bold"),
          axis.title.x = element_text(face = "bold"),
          strip.text.x = element_blank(),
          strip.text.y = element_text(angle = 0, face = "bold", hjust = 0),
          legend.position = "right",
          panel.spacing = unit(0,'lines'))

p_gleason

#GLEASON score
clinical_data_gleason<-clinical_data %>% group_by(group) %>% dplyr::count(GLEASON_SCORE)
GLEASON_df<-data.frame("altered" = c(1, 8, 5, 0, 4, 5, 0), "unaltered" = c(12, 48, 6, 1, 2, 1, 1),
                       row.names = c("3+3", "3+4", "4+3", "4+4", "4+5", "5+4", "5+5"))
print(fisher.test(GLEASON_df))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  GLEASON_df
## p-value = 0.000167
## alternative hypothesis: two.sided
p_purity<-ggplot(data = clinical_data,
              aes(x = factor( group, levels = c("altered", "unaltered")),
                  y = MEDIAN_PURITY,
                  fill = group,
                  color = group)) +
        theme_light() +
        geom_boxplot(alpha = 0.4) +
        geom_jitter(shape = 16, position = position_jitter(0.2)) +
        scale_fill_manual(values = c("altered" = "orange", "unaltered" = "#5BC0EB")) +
        scale_color_manual(values = c("altered" = "orange", "unaltered" = "#5BC0EB")) +
        stat_compare_means(label.y =70, label.x = 2, method = "t.test")+
        labs(title="Associate CIN signatures genes expression with PCa sample purity", x="patient groups", y = "sample purity")+
        theme(plot.title = element_text(size=20,  face = "bold"), 
        axis.text.x = element_text(size = 10,  face = "bold"), axis.text.y = element_text(size = 10, face = "bold"), 
        axis.title.x = element_text(size = 15, face = "bold"), axis.title.y = element_text(size = 15,  face = "bold"),
        legend.text = element_text(size = 10, face = "bold"), legend.title = element_text(size = 15,  face = "bold"))

p_purity

Performing analyses in MSKCC 2010 Cancer Cell dataset

Performing kaplan-meier survival analysis in MSKCC 2010 Cancer Cell data

prad_mskcc<-read.table(file = "/xdisk/mpadi/jiawenyang/data/centrosome_loss/prad_mskcc/data_mrna_agilent_microarray_zscores_ref_diploid_samples.txt", sep = "\t", header = T)

Entrez_id<-prad_mskcc$Entrez_Gene_Id
genes_annotation<-biomaRt::getBM(filters= "entrezgene_id", attributes= c("ensembl_gene_id","hgnc_symbol","chromosome_name","band","description","start_position","end_position", "entrezgene_id"),
                        values=Entrez_id, mart=ensembl)

zdat<-left_join(prad_mskcc, genes_annotation, by = c("Entrez_Gene_Id" = "entrezgene_id"))
zdat<-readRDS("/xdisk/mpadi/jiawenyang/result/centrosome_loss/lasso_signature_genes/prad_mskcc_zdat.rds")
sample_metadata<-read.table("/xdisk/mpadi/jiawenyang/data/centrosome_loss/prad_mskcc/data_clinical_sample.txt", sep = "\t", header = T)
patient_metadata<-read.table("/xdisk/mpadi/jiawenyang/data/centrosome_loss/prad_mskcc/data_clinical_patient.txt", sep = "\t", header = T)
clinical_metadata<-left_join(sample_metadata, patient_metadata, by = "PATIENT_ID")

MSKCC_CIN_zscore_matrix<-list()
MSKCC_CIN_clincal_matrix<-list()
for(i in 1:length(CIN.gene.set)){
  CIN.name<-CIN.gene.set[i]
  cn_signature_genes <- get(CIN.gene.set[i])
  subzdat<-zdat[zdat$hgnc_symbol %in% cn_signature_genes, c(153, 2:151)] #variables
  subzdat<-subzdat[!duplicated(subzdat$hgnc_symbol),]
  altered_group<-c()
  for (k in 1:nrow(subzdat)){
  sample_id<-colnames(subzdat[,2:ncol(subzdat)])[which(abs(subzdat[k, 2:ncol(subzdat)]) >= 3)] #absolute z-score greater than 3 is defined as altered.
  altered_group<-c(altered_group, sample_id)
  }
  altered_group<-unique(altered_group)
  unaltered_group<-unique(colnames(subzdat)[-1][!colnames(subzdat)[-1] %in% altered_group])
  subzdat_ordered<-subzdat[, c(altered_group, unaltered_group)]
  rownames(subzdat_ordered)<-subzdat$hgnc_symbol
  MSKCC_CIN_zscore_matrix[[CIN.name]]<-subzdat_ordered

  mrnaseq_clinical_metadata<-clinical_metadata[clinical_metadata$SAMPLE_ID %in% colnames(zdat)[2:nrow(zdat)],]
  alt_sample_metadata<-mrnaseq_clinical_metadata[mrnaseq_clinical_metadata$SAMPLE_ID %in% unique(altered_group), ]
  unalt_sample_metadata<-mrnaseq_clinical_metadata[!mrnaseq_clinical_metadata$SAMPLE_ID %in% unique(altered_group),]
  alt_sample_metadata[, "group"]<-rep("altered", nrow(alt_sample_metadata))
  unalt_sample_metadata[,"group"]<-rep("unaltered", nrow(unalt_sample_metadata))
  clinical_data<-rbind(alt_sample_metadata, unalt_sample_metadata)
  clinical_data$DFS_STATUS<-as.numeric(substr(clinical_data$DFS_STATUS, 1, 1))
  MSKCC_CIN_clincal_matrix[[CIN.name]]<-clinical_data
  
  #kaplan meier analysis
  clinical_data_KM<-clinical_data[!is.na(clinical_data$DFS_STATUS),]
  DF_curve<-Surv(clinical_data_KM$DFS_MONTHS, clinical_data_KM$DFS_STATUS)
  DF_Sfit<-survfit(Surv(DFS_MONTHS, DFS_STATUS)~group, data = clinical_data_KM)
  
  mskcc_prad_relapse<-ggsurvplot(DF_Sfit, 
                              conf.int = F, 
                              pval = T, 
                              risk.table = T, 
                              legend.labs = c("altered", "unaltered"),
                              legend.title = "CIN signature genes",
                              xlab = "Time (month)",
                              title = paste0("MSKCC Prostate cancer patients Disease-free survival \n ", CIN.name, " alteration"),
                              palette = c("orange", "#5BC0EB"),
                              font.tickslab = c(15),
                              risk.table.height = .25)
  print(mskcc_prad_relapse)
}

Performing association analyses in MSKCC 2010 Cancer Cell data with CN-signature 3 associated genes

library(RColorBrewer)
# Associate with Gleason Score
clinica_data_gleason<-clinical_data
clinica_data_gleason[, "GLEASON_SCORE_BOTH_SITES"]<-paste0(clinical_data$GLEASON_SCORE_1, "+", clinical_data$GLEASON_SCORE_2)
clinical_data_gleason<-clinica_data_gleason %>% group_by(group) %>% dplyr::count(GLEASON_SCORE_BOTH_SITES)
gsub("NA+NA", "NA", clinical_data_gleason$GLEASON_SCORE_BOTH_SITES)
##  [1] "3+3"   "3+4"   "4+3"   "4+4"   "4+5"   "5+3"   "NA+NA" "3+3"   "3+4"  
## [10] "3+5"   "4+3"   "4+4"   "4+5"   "5+3"   "NA+NA"
GLEASON_df<-data.frame("altered" = c(9, 15, 0, 7, 5, 8, 1, 9), "unaltered" = c(32, 38, 1, 16, 3, 3, 1, 2),
                       row.names = c("3+3", "3+4", "3+5", "4+3", "4+4", "4+5", "5+3", "NA+NA"))
fisher.test(GLEASON_df)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  GLEASON_df
## p-value = 0.0003428
## alternative hypothesis: two.sided
col_values = rev(brewer.pal(n = 8, name = "RdBu"))
col_values[8] <- alpha("grey", 0.6)

p_gleason <- ggplot(data=clinical_data_gleason, 
            aes(x=factor(group, levels = c("altered", "unaltered")), 
                y=n, 
                fill=GLEASON_SCORE_BOTH_SITES)) +
    scale_fill_manual(labels = c("3+3", "3+4", "3+5", "4+3", "4+4", "4+5", "5+3", "NA+NA"), values=col_values) +
    geom_bar(position = "fill", stat="identity", width = 0.6) +
    geom_text(aes(label = n), size = 3, hjust = 0.5, vjust = 3, position = "fill") +
    xlab("sample groups") +                 
    ylab("Percentage of each sample type") +   
    ggtitle("Associate CIN signatures genes expression with patients PCa grades") +
    theme_bw () +
    theme(title = element_text(face = "bold"),
          legend.title = element_text(face = "bold"),
          legend.text = element_text(face = "bold"),
          axis.text.x = element_text(face = "bold"),
          axis.text.y = element_text(face = "bold"),
          axis.title.x = element_text(face = "bold"),
          strip.text.x = element_blank(),
          strip.text.y = element_text(angle = 0, face = "bold", hjust = 0),
          legend.position = "right",
          panel.spacing = unit(0,'lines'))

p_gleason

library(RColorBrewer)
# Associate with sample types- metastasis or primary
clinical_data_type<-clinical_data %>% group_by(group) %>% dplyr::count(SAMPLE_TYPE)
type_df<-data.frame("altered" = c(17, 37), "unaltered" = c(2, 94), row.names = c("Metastasis", "Primary"))
print(fisher.test(type_df))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  type_df
## p-value = 4.049e-07
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##    4.661771 197.138709
## sample estimates:
## odds ratio 
##    21.1432
col_values = rev(brewer.pal(n = 8, name = "RdBu"))
col_values_type<-col_values[c(6, 3)]

p_sample_type <- ggplot(data=clinical_data_type, 
            aes(x=factor(group, levels = c("altered", "unaltered")), 
                y=n, 
                fill=SAMPLE_TYPE)) +
    scale_fill_manual(labels = c("Metastasis", "Primary"), values=col_values_type) +
    geom_bar(position = "fill", stat="identity", width = 0.6) +
    geom_text(aes(label = n), size = 3, hjust = 0.5, vjust = 3, position = "fill") +
    xlab("sample groups") +                 
    ylab("Percentage of each sample type") +   
    ggtitle("Associate CIN signatures genes expression with patients tumor type") +
    theme_bw () +
    theme(title = element_text(face = "bold"),
          legend.title = element_text(face = "bold"),
          legend.text = element_text(face = "bold"),
          axis.text.x = element_text(face = "bold"),
          axis.text.y = element_text(face = "bold"),
          axis.title.x = element_text(face = "bold"),
          strip.text.x = element_blank(),
          strip.text.y = element_text(angle = 0, face = "bold", hjust = 0),
          #strip.background =element_rect(fill="white"),
          legend.position = "right",
          panel.spacing = unit(0,'lines'))

p_sample_type

## Performing analyses in SU2C/DREAM TEAM 2018 PNAS (mCRPC) dataset

Performing kaplan-meier survival analysis in SU2C/DREAM TEAM 2018 PNAS (mCRPC) dataset

su2c_patient_data<-read.table("/xdisk/mpadi/jiawenyang/data/centrosome_loss/prad_su2c_2019/data_clinical_patient.txt", sep = "\t", header = T)
su2c_sample_data<-read.table("/xdisk/mpadi/jiawenyang/data/centrosome_loss/prad_su2c_2019/data_clinical_sample.txt", sep = "\t", header = T)
su2c_clinical_data<-dplyr::left_join(su2c_patient_data, su2c_sample_data, by = c("PATIENT_ID"))
duplicated_samples<-su2c_clinical_data[duplicated(su2c_clinical_data$PATIENT_ID), "SAMPLE_ID"]

su2c_zdat<-read.table(file = "/xdisk/mpadi/jiawenyang/data/centrosome_loss/prad_su2c_2019/data_mrna_seq_fpkm_capture_zscores_ref_all_samples.txt", sep = "\t", header = T)
colnames(su2c_zdat)<-gsub("[.]", "-", colnames(su2c_zdat))
su2c_zdat<-su2c_zdat[, !colnames(su2c_zdat) %in% duplicated_samples]

SU2C_CIN_zscore_matrix<-list()
SU2C_CIN_clincal_matrix<-list()
for (i in 1:length(CIN.gene.set)){
  CIN.name<-CIN.gene.set[i]
  cn_signature_genes <- get(CIN.gene.set[i])
  su2c_zdat_subset<-su2c_zdat[su2c_zdat$Hugo_Symbol %in% cn_signature_genes, c(1, which(colnames(su2c_zdat) %in% su2c_clinical_data[,"SAMPLE_ID"]))]
  su2c_zdat_subset<-su2c_zdat_subset[!duplicated(su2c_zdat_subset$Hugo_Symbol),]
  altered_group<-c()
  for (k in 1:nrow(su2c_zdat_subset)){
    sample_id<-colnames(su2c_zdat_subset[,2:ncol(su2c_zdat_subset)])[which(abs(su2c_zdat_subset[k, 2:ncol(su2c_zdat_subset)]) >= 2)]
    altered_group<-c(altered_group, sample_id)
  }
  altered_group<-unique(altered_group)
  unaltered_group<-unique(colnames(su2c_zdat_subset)[-1][!colnames(su2c_zdat_subset)[-1] %in% altered_group])
  subzdat_ordered<-su2c_zdat_subset[, c(altered_group, unaltered_group)]
  rownames(subzdat_ordered)<-su2c_zdat_subset$Hugo_Symbol
  SU2C_CIN_zscore_matrix[[CIN.name]]<-subzdat_ordered
  
  mrnaseq_clinical_metadata<-su2c_clinical_data[su2c_clinical_data$SAMPLE_ID %in% colnames(su2c_zdat_subset)[2:ncol(su2c_zdat_subset)],]
  alt_sample_metadata<-mrnaseq_clinical_metadata[mrnaseq_clinical_metadata$SAMPLE_ID %in% unique(altered_group), ]
  unalt_sample_metadata<-mrnaseq_clinical_metadata[!mrnaseq_clinical_metadata$SAMPLE_ID %in% unique(altered_group),]
  alt_sample_metadata[, "group"]<-rep("altered", nrow(alt_sample_metadata))
  unalt_sample_metadata[,"group"]<-rep("unaltered", nrow(unalt_sample_metadata))
  clinical_data<-rbind(alt_sample_metadata, unalt_sample_metadata)
  clinical_data<-clinical_data[!duplicated(clinical_data$PATIENT_ID),]
  clinical_data$OS_STATUS<-as.numeric(substr(clinical_data$OS_STATUS, 1, 1))
  SU2C_CIN_clincal_matrix[[CIN.name]]<-clinical_data
  
  clinical_data_KM<-clinical_data

  clinical_data_KM$OS_MONTHS <- as.numeric(clinical_data_KM$OS_MONTHS)
  BCR_curve<-Surv(clinical_data_KM$OS_MONTHS, clinical_data_KM$OS_STATUS)
  BCR_Sfit<-survfit(Surv(OS_MONTHS, OS_STATUS)~group, data = clinical_data_KM)

  su2c_prad_relapse<-ggsurvplot(BCR_Sfit, 
                                conf.int = F, 
                                pval = T, 
                                risk.table = T, 
                                legend.labs = c("altered", "unaltered"),
                                legend.title = "CIN signature genes",
                                xlab = "Time (month)",
                                title = paste0("SU2C Prostate cancer patients Overall survival \n ", CIN.name, " alteration"),
                                palette = c("orange", "#5BC0EB"),
                                font.tickslab = c(15),
                                risk.table.height = .25)

  print(su2c_prad_relapse)
}

Performing association analyses in SU2C/DREAM TEAM 2019 PNAS (mCRPC) dataset with CN-signature 3 associated genes

su2c_2019_clinical<-read.table(file = "/xdisk/mpadi/jiawenyang/data/centrosome_loss/prad_su2c_2019/prad_su2c_2019_clinical_data_cbioportal.tsv", sep = "\t", header = T)
su2c_2019_clinical<-su2c_2019_clinical[!duplicated(su2c_2019_clinical$Patient.ID),]
su2c_clinical_tumor_burden<-left_join(clinical_data, su2c_2019_clinical, by = c("PATIENT_ID" = "Patient.ID"))

genome_altered.stat.test <- su2c_clinical_tumor_burden %>%
  t_test(Fraction.Genome.Altered ~ group) %>%
  add_significance()
genome_altered.stat.test
## # A tibble: 1 × 9
##   .y.                  group1 group2    n1    n2 statistic    df      p p.signif
##   <chr>                <chr>  <chr>  <int> <int>     <dbl> <dbl>  <dbl> <chr>   
## 1 Fraction.Genome.Alt… alter… unalt…    76   125      2.00  165. 0.0476 *
p_genome_altered<-ggplot(data = su2c_clinical_tumor_burden,
              aes(x = factor( group, levels = c("altered", "unaltered")),
                  y = Fraction.Genome.Altered,
                  fill = group,
                  color = group)) +
        theme_light() +
        geom_boxplot(alpha = 0.4) +
        geom_jitter(shape = 16, position = position_jitter(0.2)) +
        scale_fill_manual(values = c("altered" = "orange", "unaltered" = "#5BC0EB")) +
        scale_color_manual(values = c("altered" = "orange", "unaltered" = "#5BC0EB")) +
        stat_compare_means(label.y =1, label.x = 2, method = "t.test")+
        labs(title="Associate CIN signatures genes expression with PCa genome alteration fraction", x="patient groups", y = "Fraction Genome altered")+
        theme(plot.title = element_text(size=20,  face = "bold"), 
        axis.text.x = element_text(size = 10,  face = "bold"), axis.text.y = element_text(size = 10, face = "bold"), 
        axis.title.x = element_text(size = 15, face = "bold"), axis.title.y = element_text(size = 15,  face = "bold"),
        legend.text = element_text(size = 10, face = "bold"), legend.title = element_text(size = 15,  face = "bold"))

p_genome_altered

Associating CN-signature exposure score with MSI score in SU2C DREAM TEAM dataset

library(readxl)
su2c_dbgap_sample_id<-read.table(file = "/xdisk/mpadi/jiawenyang/data/centrosome_loss/MSI_score/phs000915.v2.p2.txt", sep = "\t", header = T)
#https://www.ncbi.nlm.nih.gov/gap/sstr/report/phs000915.v2.p2
msi_sample_id<-readxl::read_excel("/xdisk/mpadi/jiawenyang/data/centrosome_loss/MSI_score/oncotarget-08-7452-s002.xlsx", sheet = "PRAD MSS (SU2C)")
msi_scores_prad<-readxl::read_excel("/xdisk/mpadi/jiawenyang/data/centrosome_loss/MSI_score/oncotarget-08-7452-s003.xlsx", sheet = "PRAD MSS")

su2c_clinical_data<-dplyr::left_join(su2c_clinical_data, msi_scores_prad, by = c("PATIENT_ID" = "Sample"))
su2c_clinical_data<-su2c_clinical_data[su2c_clinical_data$PATIENT_ID %in% msi_sample_id$`Submitted subject id`,]
Sig.CNV.seqz_cl_cells_prad_patients_with_clinical<-read.csv("/xdisk/mpadi/jiawenyang/result/centrosome_loss/sigminer/copy_number_signatures_enrichment_results_with_clinical_info.csv")
Sig.CNV.seqz_cl_cells_prad_patients_with_clinical<-Sig.CNV.seqz_cl_cells_prad_patients_with_clinical[Sig.CNV.seqz_cl_cells_prad_patients_with_clinical$subject_id %in% su2c_clinical_data$PATIENT_ID,]

su2c_clinical_data_with_cn_sig <- Sig.CNV.seqz_cl_cells_prad_patients_with_clinical[Sig.CNV.seqz_cl_cells_prad_patients_with_clinical$subject_id %in% intersect(msi_scores_prad$Sample, Sig.CNV.seqz_cl_cells_prad_patients_with_clinical$subject_id),]

msi_scores_prad_cn_sig <- msi_scores_prad[msi_scores_prad$Sample %in% intersect(msi_scores_prad$Sample, Sig.CNV.seqz_cl_cells_prad_patients_with_clinical$subject_id), ]
msi_scores_prad_cn_sig$Sample <- as.character(msi_scores_prad_cn_sig$Sample)

msi_scores_prad_cn_sig<-left_join(msi_scores_prad_cn_sig, su2c_clinical_data_with_cn_sig, by = c("Sample" = "subject_id"))
msi_scores_prad_cn_sig$enrich_sig <- paste0("CN-", msi_scores_prad_cn_sig$enrich_sig)

cn_sig1_sig3_test<-t.test(MSISensor ~ enrich_sig, data = msi_scores_prad_cn_sig[msi_scores_prad_cn_sig$enrich_sig %in% c("CN-Sig1" ,"CN-Sig3"),])

p_cn_sig_MSI<-ggplot(data = msi_scores_prad_cn_sig,
              aes(x = factor(enrich_sig, levels = c("CN-Sig1", "CN-Sig2", "CN-Sig3")),
                  y = MSISensor ,
                  fill = enrich_sig,
                  color = enrich_sig)) +
        theme_light() +
        geom_boxplot(alpha = 0.4) +
        geom_jitter(shape = 16, position = position_jitter(0.2)) +
        scale_fill_manual(labels = c("CN-Sig1", "CN-Sig2", "CN-Sig3"), values=c("#050506", "#DE8B36", "#CD4224")) +
        scale_color_manual(labels = c("CN-Sig1", "CN-Sig2", "CN-Sig3"), values=c("#050506", "#DE8B36", "#CD4224")) +
        labs(title="Associate CIN signatures genes expression with PCa sample MSI score", 
             x="CN-Signature groups", 
             y = "Microsatelight Instability (MSI) score")+
        theme(plot.title = element_text(size=20,  face = "bold"), 
        axis.text.x = element_text(size = 10,  face = "bold"), axis.text.y = element_text(size = 10, face = "bold"), 
        axis.title.x = element_text(size = 15,  face = "bold"), axis.title.y = element_text(size = 15,  face = "bold"),
        legend.text = element_text(size = 10, face = "bold"), legend.title = element_text(size = 15, face = "bold"))

p_cn_sig_MSI

cn_sig1_sig3_test
## 
##  Welch Two Sample t-test
## 
## data:  MSISensor by enrich_sig
## t = -2.2979, df = 52.653, p-value = 0.02557
## alternative hypothesis: true difference in means between group CN-Sig1 and group CN-Sig3 is not equal to 0
## 95 percent confidence interval:
##  -0.44379690 -0.03008928
## sample estimates:
## mean in group CN-Sig1 mean in group CN-Sig3 
##             0.1486667             0.3856098
CN_sig_score<-readRDS(file ="/xdisk/mpadi/jiawenyang/result/centrosome_loss/sigminer/Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups.RDS")
CN_sig_score<-as.data.frame(t(CN_sig_score$Exposure))
CN_sig_score[,"sample"]<-rownames(CN_sig_score)
CN_sig_score[,"sample"]<-gsub("-SRR\\w+", "", CN_sig_score[,"sample"])

msi_scores_prad_cn_score<-left_join(msi_scores_prad_cn_sig, CN_sig_score, by = c("Sample" = "sample"))
msi_scores_prad_cn_score
## # A tibble: 57 × 24
##    Sample mSINGS MSISensor MANTIS     X sample group silhouette_width enrich_sig
##    <chr>   <dbl>     <dbl>  <dbl> <int> <chr>  <int>            <dbl> <chr>     
##  1 51150… 0.0381      0.87  0.239   176 SRR30…     1            0.992 CN-Sig3   
##  2 51150… 0.0495      0     0.266   311 SRR83…     2            0.992 CN-Sig1   
##  3 51150… 0.0254      0.2   0.269   310 SRR83…     2            0.992 CN-Sig1   
##  4 51150… 0.0386      0     0.252   247 SRR83…     2            0.992 CN-Sig1   
##  5 51150… 0.0269      0     0.265   175 SRR83…     1            0.992 CN-Sig3   
##  6 51150… 0.0313      0     0.251   309 SRR83…     2            0.992 CN-Sig1   
##  7 51150… 0.0263      0.25  0.257   174 SRR83…     1            0.992 CN-Sig3   
##  8 51150… 0.028       0     0.256   173 SRR83…     1            0.992 CN-Sig3   
##  9 51150… 0.0308      0     0.250   172 SRR83…     1            0.992 CN-Sig3   
## 10 51150… 0.0331      0     0.253   171 SRR83…     1            0.992 CN-Sig3   
## # ℹ 47 more rows
## # ℹ 15 more variables: tumor_Run <chr>, Study <chr>, sample_type <chr>,
## #   tumor_body_site <chr>, normal_Run <chr>, PatientID <chr>, Age <int>,
## #   Stage <chr>, PSA <dbl>, GleasonScore <int>, Fusion <chr>, CNV_ID <chr>,
## #   Sig1 <dbl>, Sig2 <dbl>, Sig3 <dbl>
ggplot(msi_scores_prad_cn_score, aes(x=Sig3, y=MANTIS, color = MANTIS)) + 
  geom_point(color = "#CD4224")+
  theme_light() +
  geom_smooth(method=lm, color = alpha("#CD4224", 0.4)) +
  stat_cor(method = "pearson", label.x = 100, label.y = 0.3 )+
  xlab("CN-sig3 exposure score") +                 
  ylab("Microsatelight Instability (MSI) score") +   
  ggtitle("CN-signature and MSI correlation") +
  theme(plot.title = element_text(size=20, face = "bold"), 
        axis.text.x = element_text(size = 10,  face = "bold"), axis.text.y = element_text(size = 10,  face = "bold"), 
        axis.title.x = element_text(size = 15,  face = "bold"), axis.title.y = element_text(size = 15,  face = "bold"),
        legend.text = element_text(size = 10, face = "bold"), legend.title = element_text(size = 15,  face = "bold"))

ggplot(msi_scores_prad_cn_score, aes(x=Sig2, y=MANTIS, color = MANTIS)) + 
  geom_point(color = "#DE8B36")+
  theme_light() +
  geom_smooth(method=lm, color = alpha("#DE8B36", 0.4)) +
  stat_cor(method = "pearson", label.x = 10, label.y = 0.3) +
  xlab("CN-sig2 exposure score") +                 
  ylab("Microsatelight Instability (MSI) score") +   
  ggtitle("CN-signature and MSI correlation") +
  theme(plot.title = element_text(size=20,  face = "bold"), 
        axis.text.x = element_text(size = 10,  face = "bold"), axis.text.y = element_text(size = 10, face = "bold"), 
        axis.title.x = element_text(size = 15, face = "bold"), axis.title.y = element_text(size = 15,  face = "bold"),
        legend.text = element_text(size = 10,  face = "bold"), legend.title = element_text(size = 15,  face = "bold"))

ggplot(msi_scores_prad_cn_score, aes(x=Sig1, y=MANTIS, color = MANTIS)) + 
  geom_point(color = "black")+
  theme_light() +
  geom_smooth(method=lm, color = alpha("black", 0.4)) +
  stat_cor(method = "pearson", label.x = 150, label.y = 0.3) +
  xlab("CN-sig1 exposure score") +                 
  ylab("Microsatelight Instability (MSI) score") +   
  ggtitle("CN-signature and MSI correlation") +
  theme(plot.title = element_text(size=20,  face = "bold"), 
        axis.text.x = element_text(size = 10,  face = "bold"), axis.text.y = element_text(size = 10,  face = "bold"), 
        axis.title.x = element_text(size = 15,  face = "bold"), axis.title.y = element_text(size = 15,  face = "bold"),
        legend.text = element_text(size = 10,  face = "bold"), legend.title = element_text(size = 15,  face = "bold"))

Back to top

back to home